!pip install catboost plotly
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import plotly.offline as py
import plotly.figure_factory as ff
import plotly.graph_objs as go
import tqdm
# Initialise Plotly's offline notebook mode once, before any py.iplot(...) call below.
py.init_notebook_mode(connected=True)
# Layout shared by every 3-D scatter figure in this notebook: all four margins
# are removed and the 3-D scene uses aspectmode='data' (as the constant's name
# says, this is meant to keep the axes at an equal aspect ratio).
EQUAL_ASPECT_RATIO_LAYOUT = {
    'margin': {'l': 0, 'r': 0, 'b': 0, 't': 0},
    'scene': {'aspectmode': 'data'},
}
def color(x, cmap='Reds'):
    """Map numeric values to colours by min-max scaling them through a colormap.

    The values are rescaled linearly so that the smallest becomes 0 and the
    largest becomes 1, then handed to the colormap.

    Args:
        x: Array-like of numbers (NumPy array, pandas Series, list, ...).
        cmap: Name of a Matplotlib colormap, or an already-built colormap /
            callable that accepts an array of values in [0, 1].

    Returns:
        Whatever the colormap returns for the normalised values (for a
        Matplotlib colormap, an array of RGBA rows, one per input value).
    """
    if not callable(cmap):
        cmap = plt.get_cmap(cmap)

    values = np.asarray(x, dtype=float)
    if values.size == 0:
        # Nothing to normalise; np.min/np.max would raise on an empty array.
        return cmap(values)

    lowest = np.min(values)
    span = np.max(values) - lowest
    if span == 0:
        # Constant input: avoid 0/0 and send every value to the low end.
        normalised = np.zeros_like(values)
    else:
        # Divide by the range (max - min), not by max, so the result covers
        # [0, 1] even when the minimum is non-zero or the values are negative.
        normalised = (values - lowest) / span
    return cmap(normalised)
%matplotlib inline
# Load the point table and index it by scene id, so that ds.loc[<scene_id>]
# selects the rows of a single scene.
ds = pd.read_csv('./snow.csv').set_index(['scene_id'])
ds.head()
# Scene 0 as a 3-D scatter, coloured by the `ring` column ('tab20' palette);
# the hover text shows the ring value of each point.
scene = ds.loc[0]
fig = go.Figure(layout=EQUAL_ASPECT_RATIO_LAYOUT)
fig.add_scatter3d(
    x=scene.x,
    y=scene.y,
    z=scene.z,
    mode='markers',
    marker=dict(size=1, color=color(scene.ring, 'tab20')),
    text=scene.ring,
)
py.iplot(fig)
# Same scene as above, now coloured by the `intensity` column ('seismic'
# palette); the hover text shows the intensity of each point.
fig = go.Figure(layout=EQUAL_ASPECT_RATIO_LAYOUT)
fig.add_scatter3d(
    x=scene.x,
    y=scene.y,
    z=scene.z,
    mode='markers',
    marker=dict(size=1, color=color(scene.intensity, 'seismic')),
    text=scene.intensity,
)
py.iplot(fig)


# Scene 1 coloured by intensity; the hover text shows the ring value.
scene = ds.loc[1]
fig = go.Figure(layout=EQUAL_ASPECT_RATIO_LAYOUT)
fig.add_scatter3d(
    x=scene.x,
    y=scene.y,
    z=scene.z,
    mode='markers',
    marker=dict(size=1, color=color(scene.intensity, 'seismic')),
    text=scene.ring,
)
py.iplot(fig)
def filter_by_intensity(intensity, limit=2):
    """Return a mask that is true where ``intensity`` is at least ``limit``.

    Args:
        intensity: A scalar or array-like supporting element-wise comparison
            (NumPy array, pandas Series).
        limit: Inclusive lower bound.

    Returns:
        The result of the comparison: a bool for scalars, a boolean
        array/Series for array-like input.
    """
    keep_mask = limit <= intensity
    return keep_mask
# Keep only the points of the current scene that pass the intensity threshold
# and plot what is left, coloured by intensity.
filtered_scene = scene[filter_by_intensity(scene.intensity)]
fig = go.Figure(layout=EQUAL_ASPECT_RATIO_LAYOUT)
fig.add_scatter3d(
    x=filtered_scene.x,
    y=filtered_scene.y,
    z=filtered_scene.z,
    mode='markers',
    marker=dict(size=1, color=color(filtered_scene.intensity, 'seismic')),
    # The hover text must come from the same filtered rows as x/y/z; taking it
    # from the unfiltered scene attaches ring values to the wrong points.
    text=filtered_scene.ring,
)
py.iplot(fig)
# The result looks bad and unclear, so we will train a model instead.
from sklearn.neighbors import KDTree
class ComputeFeatures:
    """Compute per-point neighbourhood statistics for a point cloud.

    Calling an instance with coordinates, intensities and ring values builds a
    KDTree over the coordinates and, for every point, summarises the intensity
    and ring values of all points found within radius ``r`` of it.
    """

    def __init__(self, r=1.0):
        """Store the neighbourhood radius; the data attributes are set on call."""
        self.r = r
        self.xyz = None
        self.intensity = None
        self.ring = None
        self.index = None

    def _feature_names(self):
        """Return the column names, in the order compute_point_features emits them."""
        return [
            'number_of_neighbors',
            'mean_intensity',
            'max_intensity',
            'min_intensity',
            'std_intensity',
            'median_ring',
            'max_ring',
            'min_ring',
            'std_ring',
        ]

    def compute_point_features(self, point_id, neighbours):
        """Summarise the intensity and ring values at the ``neighbours`` indices.

        ``point_id`` is accepted but not used by the computation.

        Returns:
            A 9-tuple: neighbour count; mean, max, min and std of intensity;
            median, max, min and std of ring.
        """
        nearby_intensity = self.intensity[neighbours]
        nearby_ring = self.ring[neighbours]
        return (
            len(neighbours),
            np.mean(nearby_intensity),
            np.max(nearby_intensity),
            np.min(nearby_intensity),
            np.std(nearby_intensity),
            np.median(nearby_ring),
            np.max(nearby_ring),
            np.min(nearby_ring),
            np.std(nearby_ring),
        )

    def get_point_neighbours(self, point_id):
        """Return the indices of all points within ``self.r`` of point ``point_id``."""
        # query_radius takes a batch of query points, so wrap the single point
        # in a leading axis and unwrap the single result.
        query = np.expand_dims(self.xyz[point_id], axis=0)
        return self.index.query_radius(query, r=self.r)[0]

    def __call__(self, xyz, intensity, ring):
        """Build the spatial index and return one feature row per input point.

        Args:
            xyz: Point coordinates, one row per point.
            intensity: Per-point intensity values, indexable by an index array.
            ring: Per-point ring values, indexable by an index array.

        Returns:
            A pandas DataFrame with the columns of ``_feature_names()``.
        """
        self.xyz = xyz[:]
        self.intensity = intensity[:]
        self.ring = ring[:]
        self.index = KDTree(self.xyz)
        rows = [
            self.compute_point_features(point_id, self.get_point_neighbours(point_id))
            for point_id in range(len(self.xyz))
        ]
        return pd.DataFrame(columns=self._feature_names(), data=rows)
# ds_features = pd.read_csv('./snow_features.csv')
# ds_features = ds_features.drop(["Unnamed: 0"], axis=1)
# ds_features.shape
# features = ComputeFeatures(r=1.0)
# for scene_id in tqdm.tqdm(ds.reset_index().scene_id.unique()):
# scene = ds.loc[scene_id]
# features_df = \
# features(scene[['x', 'y', 'z']].values, scene.intensity.values, scene.ring.values)
# features_df.to_csv('./features/{}.csv'.format(scene_id))
# Scene 1 coloured by the ground-truth `label` column; the hover text shows the label.
scene = ds.loc[1]
fig = go.Figure(layout=EQUAL_ASPECT_RATIO_LAYOUT)
fig.add_scatter3d(
    x=scene.x,
    y=scene.y,
    z=scene.z,
    mode='markers',
    marker=dict(size=1, color=color(scene.label, 'seismic')),
    text=scene.label,
)
py.iplot(fig)
# Read the per-scene feature CSVs (one file per scene id, visited in the order
# the ids first appear in ds) and drop the index column stored in each file.
all_features = []
for scene_id in tqdm.tqdm(ds.reset_index().scene_id.unique()):
    features = pd.read_csv('./features/{}.csv'.format(scene_id), index_col=None)
    features = features.drop(columns=['Unnamed: 0'])
    all_features.append(features)

# Stack the per-scene tables, then put them side by side with the raw points.
# NOTE(review): the column-wise concat pairs rows purely by position, so it
# assumes the rows of ds are grouped by scene in the same order - confirm.
all_features = pd.concat(
    [ds.reset_index(), pd.concat(all_features, ignore_index=True)],
    axis=1,
)
all_features
from sklearn.model_selection import train_test_split
# train, test = train_test_split(all_features, test_size = 0.2, shuffle = True)
# val, test = train_test_split(test, test_size = 0.5, shuffle = True)
# Split by scene rather than by row, so that all points of one scene end up in
# the same subset: 80 % of the scenes for training, and the remaining 20 %
# halved into validation and test.
scenes = list(all_features.scene_id.unique())
train_scenes, test_scenes = train_test_split(scenes, test_size=0.2, shuffle=True)
validation_scenes, test_scenes = train_test_split(test_scenes, test_size=0.5, shuffle=True)

train = all_features[all_features.scene_id.isin(train_scenes)]
val = all_features[all_features.scene_id.isin(validation_scenes)]
test = all_features[all_features.scene_id.isin(test_scenes)]
import catboost
def learn(X_train, X_val, y_train, y_val):
    """Fit a 100-estimator CatBoost classifier, evaluating on the validation set.

    Args:
        X_train: Training features.
        X_val: Validation features (an object exposing ``.values``).
        y_train: Training labels.
        y_val: Validation labels (an object exposing ``.values``).

    Returns:
        The fitted ``catboost.CatBoostClassifier``.
    """
    model = catboost.CatBoostClassifier(n_estimators=100)
    validation_pair = (X_val.values, y_val.values)
    model.fit(
        X_train,
        y_train,
        eval_set=validation_pair,
        early_stopping_rounds=10,
        use_best_model=True,
        plot=True,
        verbose=False,
    )
    return model
# Model inputs are every column except the scene id, the coordinates and the
# target; the target itself is the `label` column.
X_train, y_train = train.drop(columns=["scene_id", "label", "x", "y", "z"]), train.label
X_val, y_val = val.drop(columns=["scene_id", "label", "x", "y", "z"]), val.label

# ds is not referenced after this point; presumably it is dropped here to
# release memory before training.
del ds

cls = learn(X_train, X_val, y_train, y_val)

X_test, y_test = test.drop(columns=['scene_id', 'x', 'y', 'z', 'label']), test.label
from sklearn.metrics import precision_recall_curve, precision_score, recall_score, auc
def test_one(clf, X_test, y_test, num_points=2000):
    """Return a down-sampled precision-recall curve for a fitted classifier.

    Args:
        clf: Classifier exposing ``predict_proba``; column 1 of its output is
            used as the score.
        X_test: Features to score.
        y_test: True labels for ``X_test``.
        num_points: Number of curve points to sample (evenly spaced indices,
            repeated when the curve has fewer points than requested).

    Returns:
        Tuple ``(precision, recall, thresholds)`` of equally long arrays.
    """
    y_test_hat = clf.predict_proba(X_test)
    pr, rec, thr = precision_recall_curve(y_test, y_test_hat[:, 1])
    ix = np.linspace(1, len(pr) - 1, num=num_points).astype(int)
    # precision[i] / recall[i] correspond to thresholds[i]. Only the final
    # curve point has no threshold of its own (thr is one element shorter), so
    # that single index is clamped to the last threshold instead of shifting
    # every threshold by one position.
    thr_ix = np.minimum(ix, len(thr) - 1)
    return pr[ix], rec[ix], thr[thr_ix]
def heuristic_filter_scoring():
    """Score the intensity-threshold baseline on the test set.

    For every limit in 1..9 each test point is predicted 0 when its intensity
    passes ``filter_by_intensity`` with that limit and 1 otherwise; the
    predictions are compared against the module-level ``y_test``.

    Returns:
        Tuple ``(precisions, recalls, limits)``: two lists with one score per
        limit, and the range of limits tried.
    """
    filter_range = range(1, 10)
    precisions, recalls = [], []
    for limit in filter_range:
        passes_filter = filter_by_intensity(test.intensity, limit)
        predicted = np.where(passes_filter, 0.0, 1.0)
        precisions.append(precision_score(y_test, predicted))
        recalls.append(recall_score(y_test, predicted))
    return precisions, recalls, filter_range
# Precision, recall and limits of the intensity baseline; plot_pr_rec reads
# these three module-level names when it draws the baseline trace.
pr_bl, rec_bl, thr_bl = heuristic_filter_scoring()
def plot_pr_rec(*models):
    """Plot precision-recall curves for the given models next to the baseline.

    Args:
        *models: Tuples ``(name, classifier, X_test, y_test)``. Each classifier
            is evaluated with ``test_one`` and its curve's AUC is printed.

    The intensity baseline is drawn from the module-level ``rec_bl``,
    ``pr_bl`` and ``thr_bl``. The figure is displayed with ``py.iplot``;
    nothing is returned.
    """
    traces = []
    for model, classifier, features, targets in models:
        pr, rec, thr = test_one(classifier, features, targets)
        traces.append(
            go.Scattergl(x=rec, y=pr, mode='lines', text=thr, name=f'{model}')
        )
        print(f"AUC for catboost classifier {model}: ", auc(rec, pr))

    baseline_trace = go.Scatter(
        x=rec_bl,
        y=pr_bl,
        mode='lines+markers',
        text=[str(limit) for limit in thr_bl],
        name='Intensity BL',
    )
    layout = go.Layout(
        title='Precission-recall',
        xaxis={'title': 'Recall'},
        yaxis={'title': 'Precission'},
    )
    fig = go.Figure(data=[*traces, baseline_trace], layout=layout)
    py.iplot(fig)
# Each entry is (display name, fitted classifier, test features, test labels),
# which is the tuple shape plot_pr_rec unpacks.
models = [('Catboost classifier', cls, X_test, y_test)]
plot_pr_rec(*models)
# Class probabilities for every test point, in the row order of `test`.
y_test_hat = cls.predict_proba(test.drop(['scene_id', 'x', 'y', 'z', 'label'], axis=1))

# Scene to visualise. The train/test split is random, so scene 2 is not
# guaranteed to be held out; fall back to the first test scene in that case
# instead of failing with a KeyError.
preferred_scene_id = 2
test_scene_ids = test.scene_id.unique()
scene_id = preferred_scene_id if preferred_scene_id in test_scene_ids else test_scene_ids[0]

# Select the scene with a boolean mask, which always yields a DataFrame
# (`.loc[scene_id]` collapses to a Series when the scene has a single row),
# and reuse the same mask to pick the matching prediction rows.
in_scene = (test.scene_id == scene_id).values
scene = test[in_scene].set_index(['scene_id'])
scene_predictions = y_test_hat[in_scene, 1]

# 1 where the rounded probability matches the true label, 0 otherwise.
is_correct = (np.round(scene_predictions) == scene.label.values).astype(int)

fig = go.Figure(layout=EQUAL_ASPECT_RATIO_LAYOUT)
fig.add_scatter3d(
    x=scene.x,
    y=scene.y,
    z=scene.z,
    mode='markers',
    marker=dict(size=2, color=color(is_correct, 'tab20b')),
    text=[
        f"true: {true}, predicted: {round(pred)}"
        for pred, true in zip(scene_predictions, scene.label)
    ],
)
py.iplot(fig)